library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.0       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(DT)
library(htmltools)

Setup

# Report parameters, supplied by the Rmd `params` list.
# NOTE(review): `options` shadows base::options() in the global environment.
options      <- params$options
targetVar    <- params$targetVar
# Axis-suffix unit label: throughput is MB/s, raw times are seconds.
targetUnits  <- ifelse(targetVar == 'Throughput', 'MB/s', 'secs')
showOutliers <- params$showOutliers
# Quantile 1 keeps every point; 0.90 clips the top decile when outliers are hidden.
outliersQuantile <- ifelse(showOutliers, 1, 0.90)
# Shape 19 = solid circle; NA suppresses the outlier points entirely.
outliersShape <- ifelse(showOutliers, 19, NA)
plotCols <- 2

Reading Files

# Read one headerless benchmark CSV and attach the canonical column names.
# Returns a data.frame with one row per timed operation.
read.clean.files = function(filename){
  result <- read.csv(filename, header = FALSE)
  colnames(result) <- c("Language", "Randomize", "Dataset", "MachineID",
                        "RunID", "Type", "Operation", "TimeTaken")
  result
}

# Discover all result files. The pattern is a regular expression, so the dot
# must be escaped: ".csv$" would also match names like "resultsXcsv".
files = list.files(path = "../Results/", pattern = "\\.csv$", recursive = TRUE, full.names = TRUE) # List all .csv files
#files

# Read every file belonging to one setup into a single data.frame and tag it
# with the setup name. (Replaces three copy-pasted read/rbind/tag blocks.)
read.setup = function(setup.files, setup.name) {
  rows = lapply(setup.files, read.csv, header = FALSE) # Read the files into list
  merged = do.call(rbind, rows)                        # combine the data.frames
  merged$Setup = setup.name
  merged
}

merged_data = rbind(read.setup(files[grepl("Databricks", files)], 'Databricks'),
                    read.setup(files[grepl("Local_VM", files)], 'Local VM'),
                    read.setup(files[grepl("Cluster", files)], 'Cluster'))
merged_data$Setup = as.factor(merged_data$Setup)

column.names = c("Language","Randomize","Dataset","MachineID","RunID","Type","Operation","TimeTaken","Setup")
colnames(merged_data) = column.names
# Harmonize "Operations" vs "Operation" in the Type labels.
merged_data$Type = as.factor(gsub(pattern = "Operations", replacement = "Operation", x = merged_data$Type))

# Convert identifier columns to factors
merged_data$MachineID = as.factor(merged_data$MachineID)
merged_data$Randomize = as.factor(merged_data$Randomize)
merged_data$RunID = as.factor(merged_data$RunID)

# Normalize dataset labels: "dataset_100MB" -> "100"
merged_data$Dataset = sub("dataset_", "", merged_data$Dataset) 
merged_data$Dataset = sub("MB$", "", merged_data$Dataset) 
merged_data$Dataset = as.factor(merged_data$Dataset)

# Clean the free-text Operation labels. Column-indexed assignment
# (df$col[cond] <- value) is used instead of df[cond, ]$col <- value:
# the latter errors ("replacement has 1 row, data has 0") whenever the
# condition matches zero rows.
merged_data$Operation = trimws(as.character(merged_data$Operation),'both')
merged_data$Operation[merged_data$Operation == 'Mathematical Operations on Columns'] = 'Mathematical Operation on Columns'

# Derive a coarse operation sub-type from the operation name prefix.
merged_data$subType = ''
merged_data$subType[grepl('^Sorting', merged_data$Operation)] = 'Sorting'
merged_data$subType[grepl('^Ranking', merged_data$Operation)] = 'Ranking'
merged_data$subType[grepl('^Split', merged_data$Operation)] = 'Splitting'
merged_data$subType[grepl('^GroupBy', merged_data$Operation)] = 'Grouping'
merged_data$subType[grepl(' Join ', merged_data$Operation)] = 'Joining'
merged_data$subType[grepl('^Merge', merged_data$Operation)] = 'Merging'
merged_data$subType[grepl('^Filter', merged_data$Operation)] = 'Filtering'
merged_data$subType[grepl('^Mathematical', merged_data$Operation)] = 'Mathematics'
merged_data$subType[grepl('^Pivot', merged_data$Operation)] = 'Pivots'
merged_data$subType[grepl('^Running|^Shift', merged_data$Operation)] = 'Run/Shift'
merged_data$subType[grepl('^Writing', merged_data$Operation)] = 'Writing'
# Actual on-disk size (in MB) of each nominal dataset, measured once via
# file.size(".../dataset_<n>MB.csv") / (1024 * 1024) and hard-coded here.
size_10MB  <- 11.4789848327637
size_100MB <- 115.640992164612
size_200MB <- 229.8573
size_300MB <- 343.2709
size_500MB <- 576.678165435791

print(paste("Actual Size of 10MB file (in MB)",size_10MB))
## [1] "Actual Size of 10MB file (in MB) 11.4789848327637"
print(paste("Actual Size of 100MB file (in MB)",size_100MB))
## [1] "Actual Size of 100MB file (in MB) 115.640992164612"
print(paste("Actual Size of 200MB file (in MB)",size_200MB))
## [1] "Actual Size of 200MB file (in MB) 229.8573"
print(paste("Actual Size of 300MB file (in MB)",size_300MB))
## [1] "Actual Size of 300MB file (in MB) 343.2709"
print(paste("Actual Size of 500MB file (in MB)",size_500MB))
## [1] "Actual Size of 500MB file (in MB) 576.678165435791"

# Lookup table mapping each Dataset level to its true size in MB.
size_info <- data.frame(Dataset = c("10","100","200","300","500"),
                        Size = c(size_10MB, size_100MB, size_200MB, size_300MB, size_500MB))
str(size_info)
## 'data.frame':    5 obs. of  2 variables:
##  $ Dataset: Factor w/ 5 levels "10","100","200",..: 1 2 3 4 5
##  $ Size   : num  11.5 115.6 229.9 343.3 576.7

# Attach the true size and derive throughput (MB processed per second).
merged_data <- merge(merged_data, size_info, by = 'Dataset')
merged_data$Throughput <- merged_data$Size / merged_data$TimeTaken

# Distinct environment names, used to iterate the per-setup sections below.
# (unique() already returns a character vector, so no c() wrapper is needed.)
Setups = unique(as.character(merged_data$Setup))
# Keep the raw data (including run 1) for the runs-comparison section at the end.
data_raw = merged_data
# Drop the first run of every combination before analysis -- presumably to
# exclude warm-up/caching effects; NOTE(review): confirm this rationale.
merged_data = merged_data %>% 
    filter(RunID != 1)

str(merged_data)
## 'data.frame':    8357 obs. of  12 variables:
##  $ Dataset   : Factor w/ 5 levels "10","100","200",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Language  : Factor w/ 2 levels "PySpark","Scala": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Randomize : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ MachineID : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ RunID     : Factor w/ 6 levels "1","2","3","4",..: 6 6 5 6 6 6 6 5 5 5 ...
##  $ Type      : Factor w/ 4 levels "Aggregate Operation",..: 2 3 4 4 2 2 2 2 2 3 ...
##  $ Operation : chr  "Merge 5 columns into 1" "Pivot 10 Rows and 1 Column" "Writing 10000 new rows" "Writing 10000 new rows" ...
##  $ TimeTaken : num  2.33 12.4 3.73 3.01 4.82 ...
##  $ Setup     : Factor w/ 3 levels "Cluster","Databricks",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ subType   : chr  "Merging" "Pivots" "Writing" "Writing" ...
##  $ Size      : num  11.5 11.5 11.5 11.5 11.5 ...
##  $ Throughput: num  4.924 0.926 3.076 3.811 2.379 ...
head(merged_data)
##   Dataset Language Randomize MachineID RunID             Type
## 1      10  PySpark         1         1     6 Column Operation
## 2      10  PySpark         1         1     6  Mixed Operation
## 3      10  PySpark         1         1     5    Row Operation
## 4      10  PySpark         1         1     6    Row Operation
## 5      10  PySpark         1         1     6 Column Operation
## 6      10  PySpark         1         1     6 Column Operation
##                    Operation TimeTaken      Setup subType     Size
## 1     Merge 5 columns into 1  2.331200 Databricks Merging 11.47898
## 2 Pivot 10 Rows and 1 Column 12.402202 Databricks  Pivots 11.47898
## 3     Writing 10000 new rows  3.732033 Databricks Writing 11.47898
## 4     Writing 10000 new rows  3.011919 Databricks Writing 11.47898
## 5  Left Outer Join 5 Columns  4.824355 Databricks Joining 11.47898
## 6  Left Outer Join 3 Columns 10.807082 Databricks Joining 11.47898
##   Throughput
## 1  4.9240672
## 2  0.9255602
## 3  3.0757992
## 4  3.8111861
## 5  2.3793822
## 6  1.0621724
summary(merged_data)
##  Dataset       Language    Randomize MachineID RunID   
##  10 :2100   PySpark:4697   1:8357    1:5357    1:   0  
##  100:1950   Scala  :3660             2:3000    2:1890  
##  200:1485                                      3:1863  
##  300:1472                                      4:1647  
##  500:1350                                      5:1485  
##                                                6:1472  
##                   Type       Operation           TimeTaken      
##  Aggregate Operation:1118   Length:8357        Min.   :  0.230  
##  Column Operation   :4164   Class :character   1st Qu.:  3.208  
##  Mixed Operation    : 839   Mode  :character   Median : 10.105  
##  Row Operation      :2236                      Mean   : 25.053  
##                                                3rd Qu.: 25.786  
##                                                Max.   :631.088  
##         Setup        subType               Size          Throughput     
##  Cluster   :1500   Length:8357        Min.   : 11.48   Min.   :  0.273  
##  Databricks:3317   Class :character   1st Qu.: 11.48   1st Qu.:  5.535  
##  Local VM  :3540   Mode  :character   Median :229.86   Median : 12.702  
##                                       Mean   :224.33   Mean   : 23.838  
##                                       3rd Qu.:343.27   3rd Qu.: 24.951  
##                                       Max.   :576.68   Max.   :268.098
# TRUE when knitting outside RStudio; this drives subchunkify()'s default.
print(!(.Platform$GUI == "RStudio"))
## [1] TRUE

Common Functions

# Delete any sub-chunk figure files left over from a previous knit.
# Returns TRUE when every matched file was removed (vacuously TRUE when none).
subchunkify_clear <- function() {
  leftovers <- dir(path = 'figure/', pattern = 'sub_chunk_',
                   include.dirs = TRUE, full.names = TRUE)
  all(file.remove(leftovers))
}

# Render a plot inside a dynamically generated knitr sub-chunk so each plot can
# get its own fig.height/fig.width. When `enabled` is FALSE (e.g. interactive
# RStudio sessions) the plot is drawn directly instead.
# NOTE(review): the chunk label uses floor(runif(1) * 10000), so label
# collisions are possible across many calls -- confirm acceptable.
subchunkify <- function(g, fig_height=7, fig_width=5, enabled = !(.Platform$GUI == "RStudio")) {
  if (enabled==FALSE) return(plot(g))
  # Serialize a closure that returns the plot, so it can be embedded as text
  # inside the generated chunk and re-evaluated there.
  g_deparsed <- paste0(deparse(
    function() {g}
  ), collapse = '')
  
  # The backtick fence is split ("`" then "``") so knitr does not parse this
  # string literal itself as a chunk delimiter while building the text.
  sub_chunk <- paste0("
  `","``{r sub_chunk_", floor(runif(1) * 10000), ", fig.height=", fig_height, ", fig.width=", fig_width, ", echo=FALSE}",
  "\n(", 
    g_deparsed
    , ")()",
  "\n`","``
  ")
  
  # Expand and knit the generated chunk, emitting its markdown inline.
  cat(knitr::knit(text = knitr::knit_expand(text = sub_chunk), quiet = TRUE))
}


# Apply the report's shared look to a ggplot object: red tones for PySpark,
# blue tones for Scala, light theme, grey facet strips. Extra theme settings
# pass through `...`; `strip_angle` rotates the y facet strip text.
ggplot_colors = function(plot, strip_angle = 0, ...) {
  styled <- plot +
    scale_color_manual(values = c("#ca0020", "#0571b0"), breaks = c('PySpark', 'Scala')) +
    scale_fill_manual(values = c("#f4a582", "#92c5de"), breaks = c('PySpark', 'Scala')) +
    theme_light()
  styled + ggplot2::theme(strip.text.y = element_text(angle = strip_angle),
                          strip.background = element_rect(fill = '#636363'),
                          ...)
}

Simple Linear Regression Model

Using a simple linear model (LM) we can identify the elements that contribute to the speed of the queries. The `LanguageScala` coefficient measures the overall effect of Scala relative to PySpark, keeping the other variables constant. (Note: the text originally claimed a coefficient of -7, i.e. a 7-second benefit; the model output below shows an estimate of +9.88 for this run — the sign and magnitude depend on the target variable and data, so verify against the summary below.)

subchunkify_clear()
## [1] TRUE
# Model the target metric as additive effects of data size, language,
# operation and environment.
form = as.formula(paste0(targetVar,' ~ Dataset + Language  + Operation + Setup'))
model = lm(formula = form, data = merged_data)
summary(model)
## 
## Call:
## lm(formula = form, data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -86.528  -8.215  -0.664   7.298 195.047 
## 
## Coefficients:
##                                            Estimate Std. Error t value
## (Intercept)                                 49.8253     1.4343  34.738
## Dataset100                                  16.6170     0.6705  24.783
## Dataset200                                  18.8642     0.7374  25.581
## Dataset300                                  17.0566     0.7393  23.070
## Dataset500                                  18.2446     0.7578  24.077
## LanguageScala                                9.8842     0.4644  21.283
## OperationFilter Reg Ex 1                    14.4826     1.7726   8.170
## OperationFilter Reg Ex 2                    15.0320     1.7726   8.480
## OperationFull Outer Join 10 Columns        -35.4569     2.3737 -14.938
## OperationFull Outer Join 3 Columns         -30.6454     3.2451  -9.444
## OperationFull Outer Join 5 Columns         -37.4714     2.3737 -15.786
## OperationGroupBy 1 column                  -38.2980     1.7726 -21.605
## OperationGroupBy 10 columns                -43.4185     1.7710 -24.516
## OperationGroupBy 5 columns                 -41.4954     1.7726 -23.409
## OperationInner Join 10 Columns             -33.1897     2.3737 -13.982
## OperationInner Join 3 Columns              -29.5554     3.2451  -9.108
## OperationInner Join 5 Columns              -34.4918     2.3737 -14.531
## OperationLeft Outer Join 10 Columns        -33.6506     2.3737 -14.177
## OperationLeft Outer Join 3 Columns         -29.5222     3.2451  -9.097
## OperationLeft Outer Join 5 Columns         -36.0190     2.3737 -15.174
## OperationMathematical Operation on Columns  54.0242     1.7726  30.477
## OperationMerge 10 columns into 1           -27.4617     1.7726 -15.492
## OperationMerge 2 columns into 1            -25.5555     1.7710 -14.430
## OperationMerge 5 columns into 1            -26.4484     1.7710 -14.934
## OperationPivot 1 Rows and 1 Column         -10.8216     1.7710  -6.110
## OperationPivot 10 Rows and 1 Column        -32.0375     1.7710 -18.090
## OperationPivot 5 Rows and 1 Column         -28.5859     1.7726 -16.126
## OperationRanking by Group                  -31.6886     1.7710 -17.893
## OperationRunning Sum                         2.7282     1.7710   1.540
## OperationShift (Lag)                         5.6180     1.7710   3.172
## OperationSorting Asc 1 column              -37.3011     1.7710 -21.062
## OperationSorting Asc 10 column             -37.8982     1.7710 -21.399
## OperationSorting Asc 5 column              -37.4642     1.7726 -21.135
## OperationSorting Desc 1 column             -37.2524     1.7726 -21.015
## OperationSorting Desc 10 column            -37.8062     1.7726 -21.328
## OperationSorting Desc 5 column             -37.4615     1.7726 -21.133
## OperationSplit 1 Column into 10            -32.1950     1.7710 -18.179
## OperationSplit 1 Column into 5             -29.7700     1.7710 -16.809
## OperationWriting 100 new rows              -22.7522     1.7710 -12.847
## OperationWriting 1000 new rows             -23.0391     1.7726 -12.997
## OperationWriting 10000 new rows            -25.4278     1.7710 -14.357
## SetupDatabricks                            -22.4437     0.6524 -34.403
## SetupLocal VM                              -28.6507     0.6470 -44.284
##                                            Pr(>|t|)    
## (Intercept)                                 < 2e-16 ***
## Dataset100                                  < 2e-16 ***
## Dataset200                                  < 2e-16 ***
## Dataset300                                  < 2e-16 ***
## Dataset500                                  < 2e-16 ***
## LanguageScala                               < 2e-16 ***
## OperationFilter Reg Ex 1                   3.53e-16 ***
## OperationFilter Reg Ex 2                    < 2e-16 ***
## OperationFull Outer Join 10 Columns         < 2e-16 ***
## OperationFull Outer Join 3 Columns          < 2e-16 ***
## OperationFull Outer Join 5 Columns          < 2e-16 ***
## OperationGroupBy 1 column                   < 2e-16 ***
## OperationGroupBy 10 columns                 < 2e-16 ***
## OperationGroupBy 5 columns                  < 2e-16 ***
## OperationInner Join 10 Columns              < 2e-16 ***
## OperationInner Join 3 Columns               < 2e-16 ***
## OperationInner Join 5 Columns               < 2e-16 ***
## OperationLeft Outer Join 10 Columns         < 2e-16 ***
## OperationLeft Outer Join 3 Columns          < 2e-16 ***
## OperationLeft Outer Join 5 Columns          < 2e-16 ***
## OperationMathematical Operation on Columns  < 2e-16 ***
## OperationMerge 10 columns into 1            < 2e-16 ***
## OperationMerge 2 columns into 1             < 2e-16 ***
## OperationMerge 5 columns into 1             < 2e-16 ***
## OperationPivot 1 Rows and 1 Column         1.04e-09 ***
## OperationPivot 10 Rows and 1 Column         < 2e-16 ***
## OperationPivot 5 Rows and 1 Column          < 2e-16 ***
## OperationRanking by Group                   < 2e-16 ***
## OperationRunning Sum                        0.12349    
## OperationShift (Lag)                        0.00152 ** 
## OperationSorting Asc 1 column               < 2e-16 ***
## OperationSorting Asc 10 column              < 2e-16 ***
## OperationSorting Asc 5 column               < 2e-16 ***
## OperationSorting Desc 1 column              < 2e-16 ***
## OperationSorting Desc 10 column             < 2e-16 ***
## OperationSorting Desc 5 column              < 2e-16 ***
## OperationSplit 1 Column into 10             < 2e-16 ***
## OperationSplit 1 Column into 5              < 2e-16 ***
## OperationWriting 100 new rows               < 2e-16 ***
## OperationWriting 1000 new rows              < 2e-16 ***
## OperationWriting 10000 new rows             < 2e-16 ***
## SetupDatabricks                             < 2e-16 ***
## SetupLocal VM                               < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.94 on 8314 degrees of freedom
## Multiple R-squared:  0.6093, Adjusted R-squared:  0.6073 
## F-statistic: 308.7 on 42 and 8314 DF,  p-value: < 2.2e-16

Comparing of Environments

Comparing a Local VM vs. a Cloud setup that is similar in configuration, controlling for the type of operation. The cloud machine is optimized for Scala and Spark, while the local machine has a standard installation of the software.

We notice that the Databricks environment benefits the aggregate operations, and that Databricks benefits the Scala environment for the Row Operations. Otherwise, the remaining situations show a similar performance across languages and environments.

filtered = merged_data
# Box plots of the target metric per environment, split by language and
# faceted by query type; the y axis is clipped at the outlier quantile.
p = ggplot(data = filtered,
           aes_string(x = 'Setup', y = targetVar, color = 'Language', fill = 'Language'))
p = p + geom_boxplot(outlier.size = 0.1, size = 0.1, outlier.shape = outliersShape)
p = p + scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
                           labels = scales::comma_format(accuracy = 1, suffix = targetUnits))
p = p + facet_wrap(~Type, scales = 'free', ncol = plotCols)
ggplot_colors(p, axis.text.y = element_text(size = 7), legend.position = 'right')
## Warning: Removed 836 rows containing non-finite values (stat_boxplot).

Comparing different data sizes

Comparing the performance of the different queries across different dataset sizes. We notice that the increased size has an important impact on the time. PySpark suffers more from the increased size, mainly for Row and Column Operations, while the Scala language shows a better performance.

# Box plots of the target metric per dataset size, split by language and
# faceted by query type, with the y axis clipped at the outlier quantile.
plotDataSize = function(filtered, title) {
  p = ggplot(data = filtered,
             aes_string(x = 'Dataset', y = targetVar, color = 'Language', fill = 'Language'))
  p = p + geom_boxplot(outlier.size = 0.1, size = 0.1, outlier.shape = outliersShape)
  p = p + scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
                             labels = scales::comma_format(accuracy = 1, suffix = targetUnits))
  p = p + facet_wrap(~Type, labeller = label_wrap_gen(width = 100), scales = 'free_x', ncol = plotCols)
  p = p + ggplot2::ggtitle(title)

  ggplot_colors(p, strip_angle = 0,
                axis.text.y = element_text(size = 7),
                axis.text.x = element_text(size = 9))
}

All Setups

# Dataset-size comparison pooled across every environment.
filtered= merged_data   

plotDataSize(filtered,'Comparing Datsets - All Environments')
## Warning: Removed 836 rows containing non-finite values (stat_boxplot).

By Setup

# One dataset-size comparison plot per environment.
for (setup_name in Setups) {
  message(setup_name)
  filtered = merged_data %>% filter(Setup == setup_name)
  plot(plotDataSize(filtered, paste0('Comparing Datsets - ', setup_name)))
}
## Databricks
## Warning: Removed 332 rows containing non-finite values (stat_boxplot).
## Local VM

## Warning: Removed 354 rows containing non-finite values (stat_boxplot).
## Cluster

## Warning: Removed 150 rows containing non-finite values (stat_boxplot).

Comparing Query Types

As expected, the slowest queries are the Aggregate ones, while the fastest ones are the Row Operations. Scala outperforms PySpark on the row operations, while we don’t see a significant difference on the other query types.

filtered = merged_data
# Order the query types by their median target value so the ordering is
# meaningful after coord_flip().
filtered$Type = forcats::fct_reorder(filtered$Type, filtered[[targetVar]], .fun = median, .desc = FALSE)
p = ggplot(data = filtered,
           aes_string(x = 'Type', y = targetVar, color = 'Language', fill = 'Language'))
p = p + geom_boxplot(outlier.size = 0.1, size = 0.1, outlier.shape = outliersShape)
p = p + scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
                           labels = scales::comma_format(accuracy = 1, suffix = targetUnits))
p = p + facet_wrap(~Setup, ncol = 3)
p = p + ggplot2::coord_flip()

ggplot_colors(p, axis.text.x = element_text(size = 8),
              panel.spacing.x = unit(10, 'points'),
              legend.position = 'bottom')
## Warning: Removed 836 rows containing non-finite values (stat_boxplot).

Comparing Queries

#functions
# Keep only the rows whose Operation shows a statistically significant
# PySpark-vs-Scala difference (two-sided Wilcoxon rank-sum test per
# operation, p < 0.025). Returns NULL (after a message) when nothing passes.
# Reads `targetVar` from the enclosing environment.
plot_get_significant = function(filtered){
    # p-value of the Language comparison restricted to one operation
    wilcox_p <- function(oper){
      wilcox.test(as.formula(paste0(targetVar,' ~ Language ')), alternative='two.sided',
                  data = filtered, subset = Operation == oper)$p.value
    }
    ops = unique(filtered$Operation)
    # vapply instead of sapply: guarantees a numeric vector even for 0/1 ops
    pvals = vapply(ops, wilcox_p, numeric(1))
    # which() drops NA p-values that a bare logical subset would keep as NA
    keep = ops[which(pvals < 0.025)]
    data2 = filtered[filtered$Operation %in% keep,]
    if(nrow(data2)==0) {message("No Significant Elements") ; return(NULL)}
    return(data2)
}
# Scatter + median-trend plot of the target metric per dataset size, one facet
# per `facets` variable ('Operation' by default, 'Setup' for the environment
# comparison). Accepts NULL input (from plot_get_significant) and returns a
# blank plot in that case.
#
# NOTE: the original file defined this function twice; the first definition
# (without the `facets` parameter) was immediately shadowed by the second and
# was dead code. Only the general version is kept; `facets = 'Operation'`
# preserves the first definition's behavior for all existing callers.
plot_operations_get = function(filtered,title='',subtitle='',ncol = plotCols,facets='Operation'){
  if(is.null(filtered)) return(ggplot(data=filtered)+geom_blank())
  # Re-factor Operation so facets only include the operations actually present.
  plotData = filtered %>% mutate(Operation = factor(Operation))
  p=  ggplot(data=plotData, aes_string(x='Dataset',y=targetVar,color='Language')) + 
    # Median line per language across dataset sizes (fun.y: ggplot2 <= 3.2 API)
    ggplot2::stat_summary(aes(group=Language),fun.y=median,geom='line',size=.5 )+
    geom_jitter(alpha=0.5,size=.5,width=0.1)+
    scale_y_continuous(labels=scales::comma_format(accuracy=1,suffix=targetUnits)) +
    facet_wrap(facets,scales='free',ncol=ncol,labeller = label_wrap_gen(width = round(100/ncol))) +
    ggplot2::ggtitle(title,subtitle=subtitle)
  
  ggplot_colors(p,legend.position = 'bottom'
                ,axis.text.x=element_text(size=8,angle=45,hjust=1)
                ,axis.text.y=element_text(size=8)
                )
}
# For each operation sub-type: emit one plot with every operation, then --
# when any operation shows a significant language difference -- a second plot
# restricted to the significant ones. Plots are emitted as knitr sub-chunks so
# each can size its height to the number of facet rows.
plot_operations = function(filtered,title,ncol=plotCols){
  
  for (st in sort(unique(filtered$subType))){
    data=filter(filtered,subType==st)
    p=plot_operations_get(data,title=paste0(title,' - ' , st),subtitle=' (All Operations) ',ncol=ncol)
    # fig_width scales with ncol (the original used the global plotCols here
    # but ncol in the significant-operations call below, giving inconsistent
    # widths whenever a caller passed a custom ncol).
    subchunkify(p, fig_height=ceiling(n_distinct(data$Operation)/ncol)*2.5+.5, fig_width=ncol*3) 
    
    # Reuse the already-filtered subset instead of filtering a second time.
    data=plot_get_significant(data)
    if(!is.null(data))  {
      p=plot_operations_get(data,title=paste0(title,' - ' , st ),subtitle=' (Significant Operations) ',ncol=ncol)
      subchunkify(p, fig_height=ceiling(n_distinct(data$Operation)/ncol)*2.5+.5, fig_width=ncol*3) 
    }
  }
}

# For each operation sub-type, plot the metric per dataset size faceted by
# environment (Setup), emitted as a knitr sub-chunk sized to the facet count.
plot_environments = function(filtered, title, ncol = plotCols) {

  for (st in sort(unique(filtered$subType))) {
    subset_data = filter(filtered, subType == st)
    p = plot_operations_get(subset_data, title = paste0(title, ' - ', st),
                            subtitle = ' (All Operations) ', ncol = ncol, facets = 'Setup')
    subchunkify(p, fig_height = ceiling(n_distinct(subset_data$Setup) / ncol) * 2.5 + .5,
                fig_width = ncol * 3)

  }
}
# Summarise the mean target value per operation and language, and render a
# markdown table including the Scala/PySpark ratio. Column headers now use
# `targetUnits` -- the original hard-coded "(MB/sec)", which was wrong
# whenever targetVar was TimeTaken (measured in seconds).
getTable = function(data){
    t=data %>% group_by(subType,Operation,Language) %>%
      summarise_at(.vars=targetVar,.funs=vars(mean)) %>%
      ungroup() %>%
      spread(key=Language,value=targetVar) %>%
      mutate(delta = Scala/PySpark)
    knitr::kable(t,digits=2,
                 col.names=c('Category','Operation',
                             paste0('PySpark (',targetUnits,')'),
                             paste0('Scala (',targetUnits,')'),
                             'Scala/PySpark'),
                 padding=0,format='markdown')
}

Comparing Row operations

Rows Operations - All Environments:

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Filtering Filter 42.36 52.77 1.25
Filtering Filter Reg Ex 1 58.61 65.07 1.11
Filtering Filter Reg Ex 2 58.95 65.89 1.12
Run/Shift Running Sum 12.17 96.33 7.91
Run/Shift Shift (Lag) 12.10 102.90 8.51
Writing Writing 100 new rows 18.00 32.03 1.78
Writing Writing 1000 new rows 18.05 31.30 1.73
Writing Writing 10000 new rows 15.39 29.28 1.90

Rows Operations - Setup: Databricks

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Filtering Filter 45.42 55.07 1.21
Filtering Filter Reg Ex 1 63.60 66.61 1.05
Filtering Filter Reg Ex 2 63.54 66.08 1.04
Run/Shift Running Sum 11.04 89.81 8.14
Run/Shift Shift (Lag) 10.40 93.57 9.00
Writing Writing 100 new rows 17.00 27.27 1.60
Writing Writing 1000 new rows 16.97 26.63 1.57
Writing Writing 10000 new rows 14.43 22.66 1.57

Rows Operations - Setup: Local VM

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Filtering Filter 26.49 29.93 1.13
Filtering Filter Reg Ex 1 32.80 36.44 1.11
Filtering Filter Reg Ex 2 33.27 38.13 1.15
Run/Shift Running Sum 10.07 69.53 6.90
Run/Shift Shift (Lag) 10.20 78.50 7.70
Writing Writing 100 new rows 13.48 22.56 1.67
Writing Writing 1000 new rows 14.03 22.03 1.57
Writing Writing 10000 new rows 11.51 22.05 1.92

## No Significant Elements

Rows Operations - Setup: Cluster

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Filtering Filter 79.63 93.82 1.18
Filtering Filter Reg Ex 1 119.14 119.26 1.00
Filtering Filter Reg Ex 2 119.99 121.04 1.01
Run/Shift Running Sum 20.79 162.98 7.84
Run/Shift Shift (Lag) 21.49 170.36 7.93
Writing Writing 100 new rows 33.09 60.47 1.83
Writing Writing 1000 new rows 31.84 59.19 1.86
Writing Writing 10000 new rows 28.56 56.97 1.99

## Warning in wilcox.test.default(x = c(34.3061034620522, 37.3931337469713, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(14.4125942386937, 13.3451278278002, :
## cannot compute exact p-value with ties

Comparing Columns operations

Columns Operations - All Environments:

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Joining Full Outer Join 10 Columns 4.82 7.10 1.47
Joining Full Outer Join 5 Columns 3.02 4.72 1.57
Joining Inner Join 10 Columns 6.98 9.56 1.37
Joining Inner Join 5 Columns 5.91 7.85 1.33
Joining Left Outer Join 10 Columns 6.49 9.16 1.41
Joining Left Outer Join 5 Columns 4.47 6.17 1.38
Mathematics Mathematical Operation on Columns 97.35 105.61 1.08
Merging Merge 10 columns into 1 14.25 26.11 1.83
Merging Merge 2 columns into 1 16.27 27.90 1.71
Merging Merge 5 columns into 1 15.21 27.21 1.79
Sorting Sorting Asc 1 column 8.54 11.18 1.31
Sorting Sorting Asc 10 column 7.95 10.56 1.33
Sorting Sorting Asc 5 column 8.53 10.84 1.27
Sorting Sorting Desc 1 column 8.34 11.54 1.38
Sorting Sorting Desc 10 column 8.04 10.67 1.33
Sorting Sorting Desc 5 column 8.36 11.05 1.32
Splitting Split 1 Column into 10 10.61 20.04 1.89
Splitting Split 1 Column into 5 12.30 23.38 1.90

## No Significant Elements

Columns Operations - Setup:Databricks

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Joining Full Outer Join 10 Columns 4.39 5.11 1.16
Joining Full Outer Join 5 Columns 1.86 3.29 1.77
Joining Inner Join 10 Columns 5.41 7.38 1.36
Joining Inner Join 5 Columns 4.68 6.22 1.33
Joining Left Outer Join 10 Columns 5.24 6.79 1.30
Joining Left Outer Join 5 Columns 2.40 4.17 1.74
Mathematics Mathematical Operation on Columns 99.81 98.28 0.98
Merging Merge 10 columns into 1 14.20 21.86 1.54
Merging Merge 2 columns into 1 15.49 23.58 1.52
Merging Merge 5 columns into 1 14.42 22.84 1.58
Sorting Sorting Asc 1 column 7.07 7.20 1.02
Sorting Sorting Asc 10 column 6.27 7.12 1.14
Sorting Sorting Asc 5 column 6.94 7.44 1.07
Sorting Sorting Desc 1 column 6.94 7.54 1.09
Sorting Sorting Desc 10 column 6.30 6.95 1.10
Sorting Sorting Desc 5 column 6.44 6.83 1.06
Splitting Split 1 Column into 10 12.56 19.05 1.52
Splitting Split 1 Column into 5 13.63 21.08 1.55

## Warning in wilcox.test.default(x = c(2.37938222506204, 2.05855504313115, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(1.74957457405481, 1.37888996360214, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(1.14419792326157, 0.947824107373032, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(2.85941124924865, 2.97915639123432, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(3.01828145151815, 2.11499141749447, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(1.32239706378069, 0.781816877438383, :
## cannot compute exact p-value with ties

## No Significant Elements

## No Significant Elements

Columns Operations - Setup:Local VM

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Joining Full Outer Join 10 Columns 3.34 3.65 1.09
Joining Full Outer Join 5 Columns 2.87 3.48 1.21
Joining Inner Join 10 Columns 6.42 6.73 1.05
Joining Inner Join 5 Columns 5.34 5.15 0.96
Joining Left Outer Join 10 Columns 5.68 5.87 1.03
Joining Left Outer Join 5 Columns 5.10 5.52 1.08
Mathematics Mathematical Operation on Columns 65.14 78.17 1.20
Merging Merge 10 columns into 1 10.46 20.13 1.92
Merging Merge 2 columns into 1 12.02 19.84 1.65
Merging Merge 5 columns into 1 11.53 20.09 1.74
Sorting Sorting Asc 1 column 7.46 10.26 1.37
Sorting Sorting Asc 10 column 7.07 9.48 1.34
Sorting Sorting Asc 5 column 7.49 9.51 1.27
Sorting Sorting Desc 1 column 7.18 10.70 1.49
Sorting Sorting Desc 10 column 7.06 9.61 1.36
Sorting Sorting Desc 5 column 7.70 10.67 1.39
Splitting Split 1 Column into 10 4.00 10.00 2.50
Splitting Split 1 Column into 5 5.73 15.00 2.62

## Warning in wilcox.test.default(x = c(1.23965666298723, 1.10711647223027, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(6.52882705721958, 6.927836689191,
## 6.99238882889498, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(4.28220661253785, 5.55924105941596, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(4.68869908681748, 5.59325942002566, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(1.16502398340096, 1.27786882785268, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = c(4.8713464409426, 6.47614097686192, :
## cannot compute exact p-value with ties
## No Significant Elements

## No Significant Elements

Columns Operations - Setup:Cluster

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Joining Full Outer Join 10 Columns 10.56 14.53 1.38
Joining Full Outer Join 5 Columns 6.93 8.84 1.28
Joining Inner Join 10 Columns 13.37 16.76 1.25
Joining Inner Join 5 Columns 11.29 13.80 1.22
Joining Left Outer Join 10 Columns 12.62 17.17 1.36
Joining Left Outer Join 5 Columns 8.82 10.82 1.23
Mathematics Mathematical Operation on Columns 181.72 175.16 0.96
Merging Merge 10 columns into 1 24.96 46.60 1.87
Merging Merge 2 columns into 1 30.05 52.64 1.75
Merging Merge 5 columns into 1 27.42 50.20 1.83
Sorting Sorting Asc 1 column 15.06 20.96 1.39
Sorting Sorting Asc 10 column 14.48 19.61 1.35
Sorting Sorting Asc 5 column 15.19 20.28 1.34
Sorting Sorting Desc 1 column 14.90 21.21 1.42
Sorting Sorting Desc 10 column 14.89 20.24 1.36
Sorting Sorting Desc 5 column 14.74 20.26 1.37
Splitting Split 1 Column into 10 24.46 42.10 1.72
Splitting Split 1 Column into 5 27.51 44.74 1.63

## Warning in wilcox.test.default(x = c(43.9746927716948, 43.3457651552285, :
## cannot compute exact p-value with ties
## No Significant Elements

Comparing Aggregate operations

Scala is faster than PySpark

Aggregate Operations - All Environments:

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Grouping GroupBy 1 column 7.99 9.63 1.21
Grouping GroupBy 10 columns 2.82 4.56 1.61
Grouping GroupBy 5 columns 4.74 6.50 1.37
Ranking Ranking by Group 12.25 19.15 1.56

Aggregate Operations - Setup: Databricks

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Grouping GroupBy 1 column 9.05 9.41 1.04
Grouping GroupBy 10 columns 3.07 4.07 1.33
Grouping GroupBy 5 columns 5.09 6.09 1.20
Ranking Ranking by Group 12.12 16.61 1.37

Aggregate Operations - Setup: Local VM

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Grouping GroupBy 1 column 2.59 4.78 1.85
Grouping GroupBy 10 columns 1.27 2.51 1.97
Grouping GroupBy 5 columns 1.81 3.13 1.73
Ranking Ranking by Group 7.48 11.78 1.57

Aggregate Operations - Setup: Cluster

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Grouping GroupBy 1 column 20.61 19.79 0.96
Grouping GroupBy 10 columns 6.58 9.64 1.46
Grouping GroupBy 5 columns 12.12 14.05 1.16
Ranking Ranking by Group 25.88 38.97 1.51

Comparing Mixed operations

Scala is faster than PySpark

Mixed Operations - All Environments:

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Pivots Pivot 1 Rows and 1 Column 34.94 37.75 1.08
Pivots Pivot 10 Rows and 1 Column 13.03 17.39 1.33
Pivots Pivot 5 Rows and 1 Column 16.80 20.46 1.22

Mixed Operations - Setup: Databricks

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Pivots Pivot 1 Rows and 1 Column 31.24 32.67 1.05
Pivots Pivot 10 Rows and 1 Column 11.75 13.32 1.13
Pivots Pivot 5 Rows and 1 Column 14.29 15.18 1.06

## No Significant Elements

Mixed Operations - Setup: Local VM

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Pivots Pivot 1 Rows and 1 Column 24.48 28.52 1.17
Pivots Pivot 10 Rows and 1 Column 7.72 11.61 1.50
Pivots Pivot 5 Rows and 1 Column 9.96 14.73 1.48

Mixed Operations - Setup: Cluster

Category Operation PySpark (MB/sec) Scala (MB/sec) Scala/PySpark
Pivots Pivot 1 Rows and 1 Column 73.14 66.36 0.91
Pivots Pivot 10 Rows and 1 Column 31.02 37.09 1.20
Pivots Pivot 5 Rows and 1 Column 41.89 42.48 1.01

## No Significant Elements

Comparing Runs operations

Slight improvement with additional runs (not statistically significant).

# Use the raw data (including run 1) to inspect the effect of repeated runs.
filtered = data_raw
# BUG FIX: the original assigned the reordered factor into the Operation
# column (filtered$Operation = fct_reorder(filtered$RunID, ...)) and passed
# `desc=` instead of `.desc=` -- the "Some components of ... were not used"
# warning in the rendered output confirms the argument was ignored. The
# reorder therefore never affected the RunID axis; it does now.
filtered$RunID = forcats::fct_reorder(filtered$RunID, filtered[[targetVar]], .fun = median, .desc = FALSE)
p = ggplot(data = filtered, aes_string(x = 'RunID', y = targetVar, fill = 'Language', color = 'Language')) +
  geom_boxplot(outlier.size = 0.5, size = 0.5, alpha = 0.6, outlier.shape = outliersShape) +
  scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
                     labels = scales::comma_format(accuracy = 1, suffix = targetUnits))
ggplot_colors(p)
## Warning: Removed 1025 rows containing non-finite values (stat_boxplot).

By Setup

# Same runs comparison, faceted per environment.
p = ggplot(data = filtered,
           aes_string(x = 'RunID', y = targetVar, fill = 'Language', color = 'Language'))
p = p + geom_boxplot(outlier.size = 0.5, size = 0.5, alpha = 0.6, outlier.shape = outliersShape)
p = p + scale_y_continuous(limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
                           labels = scales::comma_format(accuracy = 1, suffix = targetUnits))
p = p + facet_wrap(~Setup, ncol = 1)
ggplot_colors(p)
## Warning: Removed 1025 rows containing non-finite values (stat_boxplot).